1 Proteins of Interest

proteins_of_interest <- c(
  "PDCD4",
  "LGALS1",
  "LGALS3",
  "LGALSL", #replaced galectin by the gene name
  "CCL17",
  "BCCIP",
  "CDC42",
  "CDC37",
  "CDC38",
  "CCD86",
  "LPXN",
  "PAX",
  "CYTB",
  "SWI/SNF",
  "NFKB1",
  "TNFAIP8",
  "ST13",
  "NEUA",
  "CYC1",
  "PDCD5",
  "PDCD6",
  "PDCD6IP",
  "SERPINB6",
  "CD63",
  "CD70",
  "LCA",
  "NUDC",
  "BAG6",
  "BCLAF1",
  "JAK",
  "STAT6",
  "STAT3",
  "STAT5A",
  "STAT1",
  "NUDT5",
  "PRCP",
  "SMARCC2",
  "BANF1",
  "BAF", # added baf because BANF1 = BAF is not possible
  "MANF",
  "NENF",
  "DDB1",
  "ARMT1",
  "FEN1",
  "APEX1",
  "PAFAH1B1",
  "EEF1E1",
  "HDGF",
  "GRB2",
  "MYDGF",
  "OGFR",
  "CCAR1",
  "CCAR2",
  "AIFM1",
  "API5",
  "ACIN1",
  "RBM4",
  "SDHB",
  "PURA",
  "SDHA",
  "DLST",
  "SUCLG1",
  "OXCT1",
  "OGDH",
  "SUCLG2",
  "SUCLA2",
  "ALDH",
  "SOD2",
  "MCM2",
  "ACY1",
  "GATM",
  "HEATR",
  "NUP210",
  "NUP214",
  "VDAC1",
  "VDAC2",
  "VDAC3",
  "BCCP",
  "HK2",
  "HK1",
  "PARP1",
  "PARP4",
  "H2AFX",
  "PCYT1A",
  "HSPA1B",
  "CASP3",
  "SLC4A7",
  "EPS15L1",
  "KCNAB2",
  "TNFRSF8",
  "CCBL2",
  "GAR1"
)

2 Load Libraries & Plot function & Colors

3 Functions

############################################################
# Function: run_deqms_pipeline
# Purpose:  Fit limma model, apply DEqMS (spectraCounteBayes),
#           and show the variance boxplot.
############################################################

run_deqms_pipeline <- function(protein.matrix, design, pep.count.table,
                               contrast_string = "classC-classE",
                               main_title = "Label-free dataset") {
  
  # Step 1: Fit linear model with limma
  fit1 <- limma::lmFit(protein.matrix, design = design)
  
  # Step 2: Define contrasts
  cont <- limma::makeContrasts(contrasts = contrast_string, levels = design)
  fit2 <- limma::contrasts.fit(fit1, contrasts = cont)
  
  # Step 3: Empirical Bayes shrinkage
  fit3 <- limma::eBayes(fit2)
  fit3$sigma[which(fit3$sigma == 0)] <- 1e-14
  
  # Step 4: Attach peptide counts
  fit3$count <- pep.count.table[rownames(fit3$coefficients), "count"]
  
  # Validate peptide count input
  if (any(is.na(fit3$count)) || min(fit3$count, na.rm = TRUE) <= 0) {
    stop("Error: At least one entry in 'pep.count.table$count' is NA or <= 0. Please verify inputs.")
  }
  
  # Step 5: DEqMS modeling
  fit4 <- DEqMS::spectraCounteBayes(fit3)
  
  # Step 6: Visualization
  DEqMS::VarianceBoxplot(fit4, n = 20,
                         main = main_title,
                         xlab = "peptide count + 1")
  
  return(fit4)
}


############################################################
# Function: plot_volcano
# Purpose:  Create a volcano plot using log2 fold change and q-values
############################################################

plot_volcano <- function(res, title = "", lfc_limit = NA) {
  res.plot <- res %>%
    dplyr::filter(!is.na(adj.P.Val)) %>%
    dplyr::arrange(adj.P.Val) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      threshold = adj.P.Val < 0.05 & abs(logFC) > 1.0,
      out_of_bounds = ifelse(is.na(lfc_limit), 0, (abs(logFC) > lfc_limit) * sign(logFC)),
      logFC_capped = ifelse(out_of_bounds != 0, lfc_limit * sign(logFC), logFC)
    ) %>%
    dplyr::ungroup()
  
  res.plot %<>% 
    dplyr::mutate(genelabels = ifelse(threshold, gene, ""))
  
  maxFC <- max(abs(res.plot$logFC))
  if (!is.na(lfc_limit) && lfc_limit < maxFC) {
    xlim <- lfc_limit
  } else {
    xlim <- maxFC * 1.04
  }
  
  ggplot2::ggplot(res.plot, ggplot2::aes(x = logFC_capped, y = adj.P.Val)) +
    ggplot2::geom_point(data = subset(res.plot, out_of_bounds == 0),
                        ggplot2::aes(colour = threshold), alpha = 0.5) +
    ggplot2::geom_point(data = subset(res.plot, out_of_bounds == -1),
                        ggplot2::aes(colour = threshold), shape = "\u25c4", size = 2) +
    ggplot2::geom_point(data = subset(res.plot, out_of_bounds == 1),
                        ggplot2::aes(colour = threshold), shape = "\u25BA", size = 2) +
    ggplot2::geom_hline(yintercept = 0.05, linetype = 2) +
    ggplot2::geom_vline(xintercept = c(-1, 1), linetype = 2) +
    ggplot2::scale_x_continuous(breaks = scales::pretty_breaks(),
                                limits = c(-xlim, xlim),
                                expand = ggplot2::expansion(0.01)) +
    ggplot2::scale_y_continuous(trans = c("log10", "reverse"),
                                breaks = scales::log_breaks(),
                                labels = scales::scientific) +
    ggplot2::ggtitle(title) +
    ggplot2::xlab("log2 Fold Change") +
    ggplot2::ylab("q-value") +
    ggplot2::theme_minimal() +
    ggplot2::theme(legend.position = "none")
}


############################################################
# Function: extract_gene_names
# Purpose:  Parse FASTA headers and extract gene names (GN=... entries)
############################################################

extract_gene_names <- function(fasta_headers) {
  sapply(fasta_headers, function(x) {
    if (is.na(x) || x == "") return(NA_character_)  # Handle empty or NA lines
    
    # Find all GN=... matches
    genes <- unlist(regmatches(x, gregexpr("GN=([^ ]+)", x)))
    if (length(genes) == 0) return(NA_character_)  # No GN found
    
    # Remove "GN=" prefix
    genes <- gsub("GN=", "", genes)
    
    # Remove duplicates and collapse multiple entries
    paste(unique(genes), collapse = ";")
  })
}

4 C vs E

############################################################
# C vs E
############################################################

# Load MaxQuant output
df <- read.csv("./results_maxquant/hl540_le_galaxy.txt", sep = "\t")
df.prot <- df

# Remove reverse/contaminant/site-only identifications
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

# Extract gene names from FASTA headers
df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

# Extract LFQ intensity columns (as provided)
df.LFQ <- df.prot[, c(320:334, 335:349)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_450_LE_C1_B_01.raw.thermo.raw"
##  [2] "LFQ.intensity.HL_450_LE_C1_B_02.raw.thermo.raw"
##  [3] "LFQ.intensity.HL_450_LE_C1_B_03.raw.thermo.raw"
##  [4] "LFQ.intensity.HL_540_LE_C1_A_01.raw.thermo.raw"
##  [5] "LFQ.intensity.HL_540_LE_C1_A_02.raw.thermo.raw"
##  [6] "LFQ.intensity.HL_540_LE_C1_A_03.raw.thermo.raw"
##  [7] "LFQ.intensity.HL_540_LE_C1_C_01.raw.thermo.raw"
##  [8] "LFQ.intensity.HL_540_LE_C1_C_02.raw.thermo.raw"
##  [9] "LFQ.intensity.HL_540_LE_C1_C_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_LE_C2_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_LE_C2_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_LE_C2_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_LE_C2_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_LE_C2_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_LE_C2_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_LE_E1_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_LE_E1_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_LE_E1_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_LE_E1_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_LE_E1_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_LE_E1_B_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_LE_E1_C_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_LE_E1_C_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_LE_E1_C_03.raw.thermo.raw"
## [25] "LFQ.intensity.HL_540_LE_E2_A_01.raw.thermo.raw"
## [26] "LFQ.intensity.HL_540_LE_E2_A_02.raw.thermo.raw"
## [27] "LFQ.intensity.HL_540_LE_E2_A_03.raw.thermo.raw"
## [28] "LFQ.intensity.HL_540_LE_E2_B_01.raw.thermo.raw"
## [29] "LFQ.intensity.HL_540_LE_E2_B_02.raw.thermo.raw"
## [30] "LFQ.intensity.HL_540_LE_E2_B_03.raw.thermo.raw"
# Set protein IDs as row names
rownames(df.LFQ) <- df.prot$Majority.protein.IDs

# Transpose to samples x proteins
df.LFQ <- t(df.LFQ)

# Track all-NA protein columns to align peptide-counts later
keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

# Remove proteins with all NA
df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]

# Log2 transform
df.LFQ.clean <- log2(df.LFQ.clean)

# Impute missing values (local least squares)
result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

# Presence filters per group
df.LFQ$valid_A <- apply(df.LFQ[, 1:15], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[,16:30], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:30]

# Inspect peptide-count columns corresponding to these samples
print(colnames(df.prot[, c(71:85, 86:100)]))
##  [1] "Razor...unique.peptides.HL_450_LE_C1_B_01.raw.thermo.raw"
##  [2] "Razor...unique.peptides.HL_450_LE_C1_B_02.raw.thermo.raw"
##  [3] "Razor...unique.peptides.HL_450_LE_C1_B_03.raw.thermo.raw"
##  [4] "Razor...unique.peptides.HL_540_LE_C1_A_01.raw.thermo.raw"
##  [5] "Razor...unique.peptides.HL_540_LE_C1_A_02.raw.thermo.raw"
##  [6] "Razor...unique.peptides.HL_540_LE_C1_A_03.raw.thermo.raw"
##  [7] "Razor...unique.peptides.HL_540_LE_C1_C_01.raw.thermo.raw"
##  [8] "Razor...unique.peptides.HL_540_LE_C1_C_02.raw.thermo.raw"
##  [9] "Razor...unique.peptides.HL_540_LE_C1_C_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_LE_C2_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_LE_C2_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_LE_C2_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_LE_C2_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_LE_C2_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_LE_C2_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_LE_E1_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_LE_E1_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_LE_E1_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_LE_E1_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_LE_E1_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_LE_E1_B_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_LE_E1_C_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_LE_E1_C_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_LE_E1_C_03.raw.thermo.raw"
## [25] "Razor...unique.peptides.HL_540_LE_E2_A_01.raw.thermo.raw"
## [26] "Razor...unique.peptides.HL_540_LE_E2_A_02.raw.thermo.raw"
## [27] "Razor...unique.peptides.HL_540_LE_E2_A_03.raw.thermo.raw"
## [28] "Razor...unique.peptides.HL_540_LE_E2_B_01.raw.thermo.raw"
## [29] "Razor...unique.peptides.HL_540_LE_E2_B_02.raw.thermo.raw"
## [30] "Razor...unique.peptides.HL_540_LE_E2_B_03.raw.thermo.raw"
# Build peptide-count table (unique+razor peptides; minimum across samples)
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(71:85, 86:100)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
# Add pseudocount to avoid zeros
pep.count.table$count <- pep.count.table$count + 1

# Matrix for limma/DEqMS
protein.matrix <- as.matrix(df.LFQ.filter)

# Grouping factors: class and replicate (batch)
class     <- factor(c(rep("C", 15), rep("E", 15)))
replicate <- factor(c(rep("rep1", 9), rep("rep2", 6),
                      rep("rep1", 9), rep("rep2", 6)))

# Design without intercept; replicate as covariate
design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classE")

# Optional explicit contrast (pipeline uses contrast_string)
cont <- limma::makeContrasts(classC - classE, levels = design)

# Run DEqMS pipeline
fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classC-classE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 3.305e-16

# Collect results and add gene names
DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

# Export with timestamp
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_le/data_", timestamp, ".csv"))

# Clean result table for display
processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
DT::datatable(processed_df, rownames = FALSE)
# Subset: proteins of interest (expects 'proteins_of_interest' to exist)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
DT::datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
# Volcano plot
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs E", vp_lfc_limit)

# Counts of strongly regulated proteins
sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 52
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 39
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 91

5 C vs RE

############################################################
# C vs RE
############################################################
df <- read.csv("./results_maxquant/hl540_le_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(320:334, 365:379)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_450_LE_C1_B_01.raw.thermo.raw" 
##  [2] "LFQ.intensity.HL_450_LE_C1_B_02.raw.thermo.raw" 
##  [3] "LFQ.intensity.HL_450_LE_C1_B_03.raw.thermo.raw" 
##  [4] "LFQ.intensity.HL_540_LE_C1_A_01.raw.thermo.raw" 
##  [5] "LFQ.intensity.HL_540_LE_C1_A_02.raw.thermo.raw" 
##  [6] "LFQ.intensity.HL_540_LE_C1_A_03.raw.thermo.raw" 
##  [7] "LFQ.intensity.HL_540_LE_C1_C_01.raw.thermo.raw" 
##  [8] "LFQ.intensity.HL_540_LE_C1_C_02.raw.thermo.raw" 
##  [9] "LFQ.intensity.HL_540_LE_C1_C_03.raw.thermo.raw" 
## [10] "LFQ.intensity.HL_540_LE_C2_A_01.raw.thermo.raw" 
## [11] "LFQ.intensity.HL_540_LE_C2_A_02.raw.thermo.raw" 
## [12] "LFQ.intensity.HL_540_LE_C2_A_03.raw.thermo.raw" 
## [13] "LFQ.intensity.HL_540_LE_C2_B_01.raw.thermo.raw" 
## [14] "LFQ.intensity.HL_540_LE_C2_B_02.raw.thermo.raw" 
## [15] "LFQ.intensity.HL_540_LE_C2_B_03.raw.thermo.raw" 
## [16] "LFQ.intensity.HL_540_LE_RE1_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_LE_RE1_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_LE_RE1_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_LE_RE1_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_LE_RE1_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_LE_RE1_B_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_LE_RE1_C_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_LE_RE1_C_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_LE_RE1_C_03.raw.thermo.raw"
## [25] "LFQ.intensity.HL_540_LE_RE2_A_01.raw.thermo.raw"
## [26] "LFQ.intensity.HL_540_LE_RE2_A_02.raw.thermo.raw"
## [27] "LFQ.intensity.HL_540_LE_RE2_A_03.raw.thermo.raw"
## [28] "LFQ.intensity.HL_540_LE_RE2_B_01.raw.thermo.raw"
## [29] "LFQ.intensity.HL_540_LE_RE2_B_02.raw.thermo.raw"
## [30] "LFQ.intensity.HL_540_LE_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:15], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 16:30], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:30]

print(colnames(df.prot[, c(71:85, 116:130)]))
##  [1] "Razor...unique.peptides.HL_450_LE_C1_B_01.raw.thermo.raw" 
##  [2] "Razor...unique.peptides.HL_450_LE_C1_B_02.raw.thermo.raw" 
##  [3] "Razor...unique.peptides.HL_450_LE_C1_B_03.raw.thermo.raw" 
##  [4] "Razor...unique.peptides.HL_540_LE_C1_A_01.raw.thermo.raw" 
##  [5] "Razor...unique.peptides.HL_540_LE_C1_A_02.raw.thermo.raw" 
##  [6] "Razor...unique.peptides.HL_540_LE_C1_A_03.raw.thermo.raw" 
##  [7] "Razor...unique.peptides.HL_540_LE_C1_C_01.raw.thermo.raw" 
##  [8] "Razor...unique.peptides.HL_540_LE_C1_C_02.raw.thermo.raw" 
##  [9] "Razor...unique.peptides.HL_540_LE_C1_C_03.raw.thermo.raw" 
## [10] "Razor...unique.peptides.HL_540_LE_C2_A_01.raw.thermo.raw" 
## [11] "Razor...unique.peptides.HL_540_LE_C2_A_02.raw.thermo.raw" 
## [12] "Razor...unique.peptides.HL_540_LE_C2_A_03.raw.thermo.raw" 
## [13] "Razor...unique.peptides.HL_540_LE_C2_B_01.raw.thermo.raw" 
## [14] "Razor...unique.peptides.HL_540_LE_C2_B_02.raw.thermo.raw" 
## [15] "Razor...unique.peptides.HL_540_LE_C2_B_03.raw.thermo.raw" 
## [16] "Razor...unique.peptides.HL_540_LE_RE1_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_LE_RE1_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_LE_RE1_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_LE_RE1_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_LE_RE1_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_LE_RE1_B_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_LE_RE1_C_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_LE_RE1_C_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_LE_RE1_C_03.raw.thermo.raw"
## [25] "Razor...unique.peptides.HL_540_LE_RE2_A_01.raw.thermo.raw"
## [26] "Razor...unique.peptides.HL_540_LE_RE2_A_02.raw.thermo.raw"
## [27] "Razor...unique.peptides.HL_540_LE_RE2_A_03.raw.thermo.raw"
## [28] "Razor...unique.peptides.HL_540_LE_RE2_B_01.raw.thermo.raw"
## [29] "Razor...unique.peptides.HL_540_LE_RE2_B_02.raw.thermo.raw"
## [30] "Razor...unique.peptides.HL_540_LE_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(71:85, 116:130)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("C", 15), rep("RE", 15)))
replicate <- factor(c(rep("rep1", 9), rep("rep2", 6),
                      rep("rep1", 9), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classRE")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classC-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.3179e-16

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_le/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
DT::datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
DT::datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 28
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 33
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 61

6 C vs R

############################################################
# C vs R
############################################################

df <- read.csv("./results_maxquant/hl540_le_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(320:334, 350:364)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_450_LE_C1_B_01.raw.thermo.raw"
##  [2] "LFQ.intensity.HL_450_LE_C1_B_02.raw.thermo.raw"
##  [3] "LFQ.intensity.HL_450_LE_C1_B_03.raw.thermo.raw"
##  [4] "LFQ.intensity.HL_540_LE_C1_A_01.raw.thermo.raw"
##  [5] "LFQ.intensity.HL_540_LE_C1_A_02.raw.thermo.raw"
##  [6] "LFQ.intensity.HL_540_LE_C1_A_03.raw.thermo.raw"
##  [7] "LFQ.intensity.HL_540_LE_C1_C_01.raw.thermo.raw"
##  [8] "LFQ.intensity.HL_540_LE_C1_C_02.raw.thermo.raw"
##  [9] "LFQ.intensity.HL_540_LE_C1_C_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_LE_C2_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_LE_C2_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_LE_C2_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_LE_C2_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_LE_C2_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_LE_C2_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_LE_R1_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_LE_R1_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_LE_R1_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_LE_R1_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_LE_R1_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_LE_R1_B_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_LE_R1_C_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_LE_R1_C_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_LE_R1_C_03.raw.thermo.raw"
## [25] "LFQ.intensity.HL_540_LE_R2_A_01.raw.thermo.raw"
## [26] "LFQ.intensity.HL_540_LE_R2_A_02.raw.thermo.raw"
## [27] "LFQ.intensity.HL_540_LE_R2_A_03.raw.thermo.raw"
## [28] "LFQ.intensity.HL_540_LE_R2_B_01.raw.thermo.raw"
## [29] "LFQ.intensity.HL_540_LE_R2_B_02.raw.thermo.raw"
## [30] "LFQ.intensity.HL_540_LE_R2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:15], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 16:30], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:30]

print(colnames(df.prot[, c(71:85, 101:115)]))
##  [1] "Razor...unique.peptides.HL_450_LE_C1_B_01.raw.thermo.raw"
##  [2] "Razor...unique.peptides.HL_450_LE_C1_B_02.raw.thermo.raw"
##  [3] "Razor...unique.peptides.HL_450_LE_C1_B_03.raw.thermo.raw"
##  [4] "Razor...unique.peptides.HL_540_LE_C1_A_01.raw.thermo.raw"
##  [5] "Razor...unique.peptides.HL_540_LE_C1_A_02.raw.thermo.raw"
##  [6] "Razor...unique.peptides.HL_540_LE_C1_A_03.raw.thermo.raw"
##  [7] "Razor...unique.peptides.HL_540_LE_C1_C_01.raw.thermo.raw"
##  [8] "Razor...unique.peptides.HL_540_LE_C1_C_02.raw.thermo.raw"
##  [9] "Razor...unique.peptides.HL_540_LE_C1_C_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_LE_C2_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_LE_C2_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_LE_C2_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_LE_C2_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_LE_C2_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_LE_C2_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_LE_R1_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_LE_R1_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_LE_R1_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_LE_R1_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_LE_R1_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_LE_R1_B_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_LE_R1_C_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_LE_R1_C_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_LE_R1_C_03.raw.thermo.raw"
## [25] "Razor...unique.peptides.HL_540_LE_R2_A_01.raw.thermo.raw"
## [26] "Razor...unique.peptides.HL_540_LE_R2_A_02.raw.thermo.raw"
## [27] "Razor...unique.peptides.HL_540_LE_R2_A_03.raw.thermo.raw"
## [28] "Razor...unique.peptides.HL_540_LE_R2_B_01.raw.thermo.raw"
## [29] "Razor...unique.peptides.HL_540_LE_R2_B_02.raw.thermo.raw"
## [30] "Razor...unique.peptides.HL_540_LE_R2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(71:85, 101:115)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("C", 15), rep("R", 15)))
replicate <- factor(c(rep("rep1", 9), rep("rep2", 6),
                      rep("rep1", 9), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classR")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classC-classR"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 3.0168e-16

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_le/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
DT::datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
DT::datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs R", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 24
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 38
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 62

7 E vs R

############################################################
# E vs R
############################################################

df <- read.csv("./results_maxquant/hl540_le_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(335:349, 350:364)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_LE_E1_A_01.raw.thermo.raw"
##  [2] "LFQ.intensity.HL_540_LE_E1_A_02.raw.thermo.raw"
##  [3] "LFQ.intensity.HL_540_LE_E1_A_03.raw.thermo.raw"
##  [4] "LFQ.intensity.HL_540_LE_E1_B_01.raw.thermo.raw"
##  [5] "LFQ.intensity.HL_540_LE_E1_B_02.raw.thermo.raw"
##  [6] "LFQ.intensity.HL_540_LE_E1_B_03.raw.thermo.raw"
##  [7] "LFQ.intensity.HL_540_LE_E1_C_01.raw.thermo.raw"
##  [8] "LFQ.intensity.HL_540_LE_E1_C_02.raw.thermo.raw"
##  [9] "LFQ.intensity.HL_540_LE_E1_C_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_LE_E2_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_LE_E2_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_LE_E2_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_LE_E2_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_LE_E2_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_LE_E2_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_LE_R1_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_LE_R1_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_LE_R1_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_LE_R1_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_LE_R1_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_LE_R1_B_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_LE_R1_C_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_LE_R1_C_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_LE_R1_C_03.raw.thermo.raw"
## [25] "LFQ.intensity.HL_540_LE_R2_A_01.raw.thermo.raw"
## [26] "LFQ.intensity.HL_540_LE_R2_A_02.raw.thermo.raw"
## [27] "LFQ.intensity.HL_540_LE_R2_A_03.raw.thermo.raw"
## [28] "LFQ.intensity.HL_540_LE_R2_B_01.raw.thermo.raw"
## [29] "LFQ.intensity.HL_540_LE_R2_B_02.raw.thermo.raw"
## [30] "LFQ.intensity.HL_540_LE_R2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:15], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 15:30], 1, function(x) sum(!is.na(x)))   # as provided
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:30]

print(colnames(df.prot[, c(86:100, 101:115)]))
##  [1] "Razor...unique.peptides.HL_540_LE_E1_A_01.raw.thermo.raw"
##  [2] "Razor...unique.peptides.HL_540_LE_E1_A_02.raw.thermo.raw"
##  [3] "Razor...unique.peptides.HL_540_LE_E1_A_03.raw.thermo.raw"
##  [4] "Razor...unique.peptides.HL_540_LE_E1_B_01.raw.thermo.raw"
##  [5] "Razor...unique.peptides.HL_540_LE_E1_B_02.raw.thermo.raw"
##  [6] "Razor...unique.peptides.HL_540_LE_E1_B_03.raw.thermo.raw"
##  [7] "Razor...unique.peptides.HL_540_LE_E1_C_01.raw.thermo.raw"
##  [8] "Razor...unique.peptides.HL_540_LE_E1_C_02.raw.thermo.raw"
##  [9] "Razor...unique.peptides.HL_540_LE_E1_C_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_LE_E2_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_LE_E2_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_LE_E2_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_LE_E2_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_LE_E2_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_LE_E2_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_LE_R1_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_LE_R1_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_LE_R1_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_LE_R1_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_LE_R1_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_LE_R1_B_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_LE_R1_C_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_LE_R1_C_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_LE_R1_C_03.raw.thermo.raw"
## [25] "Razor...unique.peptides.HL_540_LE_R2_A_01.raw.thermo.raw"
## [26] "Razor...unique.peptides.HL_540_LE_R2_A_02.raw.thermo.raw"
## [27] "Razor...unique.peptides.HL_540_LE_R2_A_03.raw.thermo.raw"
## [28] "Razor...unique.peptides.HL_540_LE_R2_B_01.raw.thermo.raw"
## [29] "Razor...unique.peptides.HL_540_LE_R2_B_02.raw.thermo.raw"
## [30] "Razor...unique.peptides.HL_540_LE_R2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(86:100, 101:115)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("E", 15), rep("R", 15)))
replicate <- factor(c(rep("rep1", 9), rep("rep2", 6),
                      rep("rep1", 9), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classE", "classR")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classE-classR"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 2.5703e-16

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_le/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
DT::datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
DT::datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "E vs R", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 37
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 63
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 100

8 E vs RE

############################################################
# E vs RE
############################################################

df <- read.csv("./results_maxquant/hl540_le_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(335:349, 365:379)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_LE_E1_A_01.raw.thermo.raw" 
##  [2] "LFQ.intensity.HL_540_LE_E1_A_02.raw.thermo.raw" 
##  [3] "LFQ.intensity.HL_540_LE_E1_A_03.raw.thermo.raw" 
##  [4] "LFQ.intensity.HL_540_LE_E1_B_01.raw.thermo.raw" 
##  [5] "LFQ.intensity.HL_540_LE_E1_B_02.raw.thermo.raw" 
##  [6] "LFQ.intensity.HL_540_LE_E1_B_03.raw.thermo.raw" 
##  [7] "LFQ.intensity.HL_540_LE_E1_C_01.raw.thermo.raw" 
##  [8] "LFQ.intensity.HL_540_LE_E1_C_02.raw.thermo.raw" 
##  [9] "LFQ.intensity.HL_540_LE_E1_C_03.raw.thermo.raw" 
## [10] "LFQ.intensity.HL_540_LE_E2_A_01.raw.thermo.raw" 
## [11] "LFQ.intensity.HL_540_LE_E2_A_02.raw.thermo.raw" 
## [12] "LFQ.intensity.HL_540_LE_E2_A_03.raw.thermo.raw" 
## [13] "LFQ.intensity.HL_540_LE_E2_B_01.raw.thermo.raw" 
## [14] "LFQ.intensity.HL_540_LE_E2_B_02.raw.thermo.raw" 
## [15] "LFQ.intensity.HL_540_LE_E2_B_03.raw.thermo.raw" 
## [16] "LFQ.intensity.HL_540_LE_RE1_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_LE_RE1_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_LE_RE1_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_LE_RE1_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_LE_RE1_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_LE_RE1_B_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_LE_RE1_C_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_LE_RE1_C_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_LE_RE1_C_03.raw.thermo.raw"
## [25] "LFQ.intensity.HL_540_LE_RE2_A_01.raw.thermo.raw"
## [26] "LFQ.intensity.HL_540_LE_RE2_A_02.raw.thermo.raw"
## [27] "LFQ.intensity.HL_540_LE_RE2_A_03.raw.thermo.raw"
## [28] "LFQ.intensity.HL_540_LE_RE2_B_01.raw.thermo.raw"
## [29] "LFQ.intensity.HL_540_LE_RE2_B_02.raw.thermo.raw"
## [30] "LFQ.intensity.HL_540_LE_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:15], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 16:30], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:30]

print(colnames(df.prot[, c(86:100, 116:130)]))
##  [1] "Razor...unique.peptides.HL_540_LE_E1_A_01.raw.thermo.raw" 
##  [2] "Razor...unique.peptides.HL_540_LE_E1_A_02.raw.thermo.raw" 
##  [3] "Razor...unique.peptides.HL_540_LE_E1_A_03.raw.thermo.raw" 
##  [4] "Razor...unique.peptides.HL_540_LE_E1_B_01.raw.thermo.raw" 
##  [5] "Razor...unique.peptides.HL_540_LE_E1_B_02.raw.thermo.raw" 
##  [6] "Razor...unique.peptides.HL_540_LE_E1_B_03.raw.thermo.raw" 
##  [7] "Razor...unique.peptides.HL_540_LE_E1_C_01.raw.thermo.raw" 
##  [8] "Razor...unique.peptides.HL_540_LE_E1_C_02.raw.thermo.raw" 
##  [9] "Razor...unique.peptides.HL_540_LE_E1_C_03.raw.thermo.raw" 
## [10] "Razor...unique.peptides.HL_540_LE_E2_A_01.raw.thermo.raw" 
## [11] "Razor...unique.peptides.HL_540_LE_E2_A_02.raw.thermo.raw" 
## [12] "Razor...unique.peptides.HL_540_LE_E2_A_03.raw.thermo.raw" 
## [13] "Razor...unique.peptides.HL_540_LE_E2_B_01.raw.thermo.raw" 
## [14] "Razor...unique.peptides.HL_540_LE_E2_B_02.raw.thermo.raw" 
## [15] "Razor...unique.peptides.HL_540_LE_E2_B_03.raw.thermo.raw" 
## [16] "Razor...unique.peptides.HL_540_LE_RE1_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_LE_RE1_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_LE_RE1_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_LE_RE1_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_LE_RE1_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_LE_RE1_B_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_LE_RE1_C_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_LE_RE1_C_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_LE_RE1_C_03.raw.thermo.raw"
## [25] "Razor...unique.peptides.HL_540_LE_RE2_A_01.raw.thermo.raw"
## [26] "Razor...unique.peptides.HL_540_LE_RE2_A_02.raw.thermo.raw"
## [27] "Razor...unique.peptides.HL_540_LE_RE2_A_03.raw.thermo.raw"
## [28] "Razor...unique.peptides.HL_540_LE_RE2_B_01.raw.thermo.raw"
## [29] "Razor...unique.peptides.HL_540_LE_RE2_B_02.raw.thermo.raw"
## [30] "Razor...unique.peptides.HL_540_LE_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(86:100, 116:130)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("E", 15), rep("RE", 15)))
replicate <- factor(c(rep("rep1", 9), rep("rep2", 6),
                      rep("rep1", 9), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classE", "classRE")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classE-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 8.0366e-17

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_le/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
DT::datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
DT::datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "E vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 17
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 14
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 31

9 R vs RE

############################################################
# R vs RE
############################################################
df <- read.csv("./results_maxquant/hl540_le_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(350:364, 365:379)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_LE_R1_A_01.raw.thermo.raw" 
##  [2] "LFQ.intensity.HL_540_LE_R1_A_02.raw.thermo.raw" 
##  [3] "LFQ.intensity.HL_540_LE_R1_A_03.raw.thermo.raw" 
##  [4] "LFQ.intensity.HL_540_LE_R1_B_01.raw.thermo.raw" 
##  [5] "LFQ.intensity.HL_540_LE_R1_B_02.raw.thermo.raw" 
##  [6] "LFQ.intensity.HL_540_LE_R1_B_03.raw.thermo.raw" 
##  [7] "LFQ.intensity.HL_540_LE_R1_C_01.raw.thermo.raw" 
##  [8] "LFQ.intensity.HL_540_LE_R1_C_02.raw.thermo.raw" 
##  [9] "LFQ.intensity.HL_540_LE_R1_C_03.raw.thermo.raw" 
## [10] "LFQ.intensity.HL_540_LE_R2_A_01.raw.thermo.raw" 
## [11] "LFQ.intensity.HL_540_LE_R2_A_02.raw.thermo.raw" 
## [12] "LFQ.intensity.HL_540_LE_R2_A_03.raw.thermo.raw" 
## [13] "LFQ.intensity.HL_540_LE_R2_B_01.raw.thermo.raw" 
## [14] "LFQ.intensity.HL_540_LE_R2_B_02.raw.thermo.raw" 
## [15] "LFQ.intensity.HL_540_LE_R2_B_03.raw.thermo.raw" 
## [16] "LFQ.intensity.HL_540_LE_RE1_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_LE_RE1_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_LE_RE1_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_LE_RE1_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_LE_RE1_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_LE_RE1_B_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_LE_RE1_C_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_LE_RE1_C_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_LE_RE1_C_03.raw.thermo.raw"
## [25] "LFQ.intensity.HL_540_LE_RE2_A_01.raw.thermo.raw"
## [26] "LFQ.intensity.HL_540_LE_RE2_A_02.raw.thermo.raw"
## [27] "LFQ.intensity.HL_540_LE_RE2_A_03.raw.thermo.raw"
## [28] "LFQ.intensity.HL_540_LE_RE2_B_01.raw.thermo.raw"
## [29] "LFQ.intensity.HL_540_LE_RE2_B_02.raw.thermo.raw"
## [30] "LFQ.intensity.HL_540_LE_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:15], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 16:30], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:30]

print(colnames(df.prot[, c(101:115, 116:130)]))
##  [1] "Razor...unique.peptides.HL_540_LE_R1_A_01.raw.thermo.raw" 
##  [2] "Razor...unique.peptides.HL_540_LE_R1_A_02.raw.thermo.raw" 
##  [3] "Razor...unique.peptides.HL_540_LE_R1_A_03.raw.thermo.raw" 
##  [4] "Razor...unique.peptides.HL_540_LE_R1_B_01.raw.thermo.raw" 
##  [5] "Razor...unique.peptides.HL_540_LE_R1_B_02.raw.thermo.raw" 
##  [6] "Razor...unique.peptides.HL_540_LE_R1_B_03.raw.thermo.raw" 
##  [7] "Razor...unique.peptides.HL_540_LE_R1_C_01.raw.thermo.raw" 
##  [8] "Razor...unique.peptides.HL_540_LE_R1_C_02.raw.thermo.raw" 
##  [9] "Razor...unique.peptides.HL_540_LE_R1_C_03.raw.thermo.raw" 
## [10] "Razor...unique.peptides.HL_540_LE_R2_A_01.raw.thermo.raw" 
## [11] "Razor...unique.peptides.HL_540_LE_R2_A_02.raw.thermo.raw" 
## [12] "Razor...unique.peptides.HL_540_LE_R2_A_03.raw.thermo.raw" 
## [13] "Razor...unique.peptides.HL_540_LE_R2_B_01.raw.thermo.raw" 
## [14] "Razor...unique.peptides.HL_540_LE_R2_B_02.raw.thermo.raw" 
## [15] "Razor...unique.peptides.HL_540_LE_R2_B_03.raw.thermo.raw" 
## [16] "Razor...unique.peptides.HL_540_LE_RE1_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_LE_RE1_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_LE_RE1_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_LE_RE1_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_LE_RE1_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_LE_RE1_B_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_LE_RE1_C_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_LE_RE1_C_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_LE_RE1_C_03.raw.thermo.raw"
## [25] "Razor...unique.peptides.HL_540_LE_RE2_A_01.raw.thermo.raw"
## [26] "Razor...unique.peptides.HL_540_LE_RE2_A_02.raw.thermo.raw"
## [27] "Razor...unique.peptides.HL_540_LE_RE2_A_03.raw.thermo.raw"
## [28] "Razor...unique.peptides.HL_540_LE_RE2_B_01.raw.thermo.raw"
## [29] "Razor...unique.peptides.HL_540_LE_RE2_B_02.raw.thermo.raw"
## [30] "Razor...unique.peptides.HL_540_LE_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(86:100, 116:130)])),  # as provided
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("R", 15), rep("RE", 15)))
replicate <- factor(c(rep("rep1", 9), rep("rep2", 6),
                      rep("rep1", 9), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classR", "classRE")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classR-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 2.3143e-16

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_le/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
DT::datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
DT::datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "R vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 29
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 22
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 51